[1] 6471 113
Techniques for Avoiding Overfitting, Pt. 1
Feature Engineering
Behavioral Data Science in R II
Unit 2
Module 6
n_obs = 1000
sim_lin_df =
tibble(
x1 = rnorm(n_obs,0,5),
x2 = rnorm(n_obs,0,10),
x3 = rnorm(n_obs,0,15),
y = 4.17 + 8.06*x1 + 8.06*x2 + 8.06*x3 + rnorm(n_obs,0,10)
) %>%
mutate(
noise1 = rnorm(n_obs,0,10),
noise2 = rnorm(n_obs,0,10),
perfectCorr_x2 = 2.55 + 3.94*x2,
highCorr_x3 = 7.92 + 4.37*x3 + rnorm(n_obs,0,40),
linComb_x2_x3 = 17 + 3.1*x2 + 4.7*x3, # x6 is a linear combination of x1 and x2
zeroVar = rep(1.24, n_obs), # x5 has zero variance
) [1] "Corr(x3, highCorr_x3):"
[1] 0.8506153
Kept
[1] "x1" "x2" "x3" "noise1"
[5] "noise2" "perfectCorr_x2" "highCorr_x3" "linComb_x2_x3"
[9] "y"
Removed
[1] "zeroVar"
Kept
[1] "x1" "x2" "x3" "noise1"
[5] "noise2" "perfectCorr_x2" "highCorr_x3" "y"
Removed
[1] "linComb_x2_x3" "zeroVar"
Kept
[1] "x1" "x3" "noise1" "noise2"
[5] "perfectCorr_x2" "highCorr_x3" "y"
Removed
[1] "x2" "linComb_x2_x3" "zeroVar"
lr_fit <- workflow() %>%
add_recipe(rec) %>%
add_model(linear_reg()) %>%
fit(sim_lin_df)
tidy(lr_fit) %>% arrange(desc(p.value))# A tibble: 6 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 noise1 0.000322 0.0315 0.0102 0.992
2 noise2 0.0453 0.0318 1.43 0.154
3 (Intercept) -1.14 0.316 -3.59 0.000342
4 x1 8.06 0.0636 127. 0
5 x3 8.10 0.0214 378. 0
6 perfectCorr_x2 2.04 0.00797 256. 0
rec2 <- rec %>%
step_rm(all_of("noise2"))
lr_fit <- workflow() %>%
add_recipe(rec2) %>%
add_model(linear_reg()) %>%
fit(sim_lin_df)
tidy(lr_fit) %>% arrange(desc(p.value))# A tibble: 5 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 noise1 0.00117 0.0315 0.0371 0.970
2 (Intercept) -1.12 0.316 -3.54 0.000423
3 x1 8.06 0.0636 127. 0
4 x3 8.09 0.0214 378. 0
5 perfectCorr_x2 2.04 0.00796 257. 0
rec2 <- rec %>%
step_rm(all_of(c("noise2","noise1")))
lr_fit <- workflow() %>%
add_recipe(rec2) %>%
add_model(linear_reg()) %>%
fit(sim_lin_df)
tidy(lr_fit) %>% arrange(desc(p.value))# A tibble: 4 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -1.12 0.316 -3.54 0.000413
2 x1 8.06 0.0636 127. 0
3 x3 8.09 0.0214 378. 0
4 perfectCorr_x2 2.04 0.00793 258. 0
A.k.a.:
pca_rec <- recipe(y ~ ., sim_lin_df) %>%
step_nzv(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors(), num_comp = 3)
pca_estimates <- prep(pca_rec, sim_lin_df)
pca_data <- bake(pca_estimates, sim_lin_df)
tidy(pca_estimates, number=3)# A tibble: 64 × 4
terms value component id
<chr> <dbl> <chr> <chr>
1 x1 0.0217 PC1 pca_EoYnc
2 x2 0.317 PC1 pca_EoYnc
3 x3 0.494 PC1 pca_EoYnc
4 noise1 0.0299 PC1 pca_EoYnc
5 noise2 -0.0518 PC1 pca_EoYnc
6 perfectCorr_x2 0.317 PC1 pca_EoYnc
7 highCorr_x3 0.469 PC1 pca_EoYnc
8 linComb_x2_x3 0.575 PC1 pca_EoYnc
9 x1 0.132 PC2 pca_EoYnc
10 x2 0.595 PC2 pca_EoYnc
# ℹ 54 more rows
# A tibble: 8 × 4
terms value component id
<chr> <dbl> <int> <chr>
1 cumulative percent variance 36.3 1 pca_EoYnc
2 cumulative percent variance 60.7 2 pca_EoYnc
3 cumulative percent variance 73.3 3 pca_EoYnc
4 cumulative percent variance 85.7 4 pca_EoYnc
5 cumulative percent variance 97.6 5 pca_EoYnc
6 cumulative percent variance 100 6 pca_EoYnc
7 cumulative percent variance 100 7 pca_EoYnc
8 cumulative percent variance 100 8 pca_EoYnc
# A tibble: 22 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0 rmse standard 9.85 5 0.254 Preprocessor1_Model01
2 0 rsq standard 0.996 5 0.000467 Preprocessor1_Model01
3 0.1 rmse standard 9.85 5 0.254 Preprocessor1_Model02
4 0.1 rsq standard 0.996 5 0.000467 Preprocessor1_Model02
5 0.2 rmse standard 9.85 5 0.254 Preprocessor1_Model03
6 0.2 rsq standard 0.996 5 0.000467 Preprocessor1_Model03
7 0.3 rmse standard 9.85 5 0.254 Preprocessor1_Model04
8 0.3 rsq standard 0.996 5 0.000467 Preprocessor1_Model04
9 0.4 rmse standard 9.85 5 0.254 Preprocessor1_Model05
10 0.4 rsq standard 0.996 5 0.000467 Preprocessor1_Model05
# ℹ 12 more rows
# A tibble: 9 × 3
term estimate penalty
<chr> <dbl> <dbl>
1 (Intercept) 3.68e+ 0 0.8
2 x1 3.91e+ 1 0.8
3 x2 2.77e+ 1 0.8
4 x3 0 0.8
5 noise1 0 0.8
6 noise2 0 0.8
7 perfectCorr_x2 2.99e-12 0.8
8 highCorr_x3 0 0.8
9 linComb_x2_x3 1.27e+ 2 0.8